1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

1.2 The Data

Data source:

https://www.kaggle.com/datasets/cdc/national-health-and-nutrition-examination-survey?select=labs.csv

Details:

https://www.cdc.gov/Nchs/Nhanes/about_nhanes.htm

demographic <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/demographic.xlsx"))
demographiDesc <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/demographic.xlsx",sheet = "Sheet1"))

rownames(demographic) <- demographic$SEQN
demographic$SEQN <- NULL

vartokeep <- demographiDesc[demographiDesc$Keep=="Yes",1]
demographic <- demographic[,vartokeep]

keepColumn <- apply(!is.na(demographic),2,sum)/nrow(demographic) > 0.75
demographic <- demographic[,keepColumn]
demographic <- demographic[complete.cases(demographic),]

## Examination data
examination <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/examination.xlsx"))
examinationDesc <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/examination.xlsx",sheet = "Sheet1"))
rownames(examination) <- examination$SEQN
examination$SEQN <- NULL

bpresDI <- examination[,c("BPXDI1","BPXDI2","BPXDI3","BPXDI4")]
examination[,c("BPXDI1","BPXDI2","BPXDI3","BPXDI4")] <- NULL
bpresDIM <- apply(bpresDI,1,mean,na.rm=TRUE)
summary(bpresDIM)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#>    0.00   58.00   66.67   65.44   74.67  128.00    2282

bpresSY <- examination[,c("BPXSY1","BPXSY2","BPXSY3","BPXSY4")]
examination[,c("BPXSY1","BPXSY2","BPXSY3","BPXSY4")] <- NULL
bpresSYM <- apply(bpresSY,1,mean,na.rm=TRUE)
summary(bpresSYM)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#>   64.67  106.00  115.33  118.31  128.00  228.67    2282

examination$PEASCST1 <- NULL
examination$bpresDIM <- bpresDIM
examination$bpresSYM <- bpresSYM
  
keepColumn <- apply(!is.na(examination),2,sum)/nrow(examination) > 0.75
examination <- examination[,keepColumn]
keepColumnVar <- !is.na(apply(examination,2,var,na.rm = TRUE))
examination <- examination[,keepColumnVar]
examination <- examination[complete.cases(examination),]
examination <- examination[examination$bpresDIM>0,]



## questionnaire data
questionnaire <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/questionnaire.xlsx"))
questionnaireDesc <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/questionnaire.xlsx",sheet = "Sheet1"))
rownames(questionnaire) <- questionnaire$SEQN
questionnaire$SEQN <- NULL
table(questionnaire$MCQ010)
#> 
#>    1    2    7    9 
#> 1538 8222    1    8

#keepColumn <- apply(!is.na(questionnaire),2,sum)/nrow(questionnaire) > 0.85
#questionnaire <- questionnaire[,keepColumn]
#keepColumnVar <- !is.na(apply(questionnaire,2,var,na.rm = TRUE))
#questionnaire <- questionnaire[,keepColumnVar]
#questionnaire <- questionnaire[complete.cases(questionnaire),]
#keeprow <- apply(questionnaire,2,max) < 9000
#questionnaire <- questionnaire[keeprow,]
#table(questionnaire$MCQ010)

## Diabetes
Diabetes <- questionnaire$MCQ010
names(Diabetes) <- rownames(questionnaire) 
Diabetes <- Diabetes[!is.na(Diabetes)]
Diabetes <- Diabetes[Diabetes<3]
table(Diabetes)
#> Diabetes
#>    1    2 
#> 1538 8222

## Depression 
#depression <- questionnaire[,c("DPQ010","DPQ020","DPQ030","DPQ040","DPQ050","DPQ060","DPQ070","DPQ080","DPQ090")]
#donnow <- apply(is.na(depression),1,sum) == 9
#depressionTot <- apply(depression>0,1,sum,na.rm=TRUE) > 0

depression <- questionnaire[,c("DPQ040")]
names(depression) <- rownames(questionnaire)
depression[depression>5] <- "NA"
depressionTot <- 1*(depression[!is.na(depression)] > 0)


table(depressionTot)
#> depressionTot
#>    0    1 
#> 2650 2745

#depressionTot <- depressionTot[!donnow]
#table(depressionTot)


## The labs data
labs <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/labs.xlsx"))
labsDesc <- as.data.frame(read_excel("~/GitHub/LatentBiomarkers/Data/NHNES/labs.xlsx",sheet = "Sheet1"))

rownames(labs) <- labs$SEQN
labs$SEQN <- NULL

keepColumn <- apply(!is.na(labs),2,sum)/nrow(labs) > 0.75
labs <- labs[,keepColumn]
keepColumnVar <- !is.na(apply(labs,2,var,na.rm = TRUE))
labs <- labs[,keepColumnVar]
labs <- labs[complete.cases(labs),]

includeIDS <- rownames(labs)[rownames(labs) %in% rownames(examination)]
includeIDS <- includeIDS[includeIDS %in% rownames(demographic)]

includeIDS <- includeIDS[includeIDS %in% names(Diabetes)]

Diab <- Diabetes[includeIDS]
table(Diab)
#> Diab
#>    1    2 
#>  992 5006

NHNES <- cbind(demographic[includeIDS,],examination[includeIDS,],labs[includeIDS,]) 
NHNES$Diab <- Diab

table(NHNES$Diab)
#> 
#>    1    2 
#>  992 5006

NHNES <- NHNES[NHNES$RIDAGEYR >= 20,]

NHNES$Diab <- 1*(NHNES$Diab == 1)

table(NHNES$Diab)
#> 
#>    0    1 
#> 3664  650
includeIDS <- rownames(NHNES)
includeIDS <- includeIDS[includeIDS %in% names(depressionTot)]
depressionTot <- depressionTot[includeIDS]
NHNESDep <- cbind(NHNES[includeIDS,],depressionTot)

NHNESDep$depressionTot <- 1*NHNESDep$depressionTot

tbdep <- table(NHNESDep$depressionTot)
genderDiab <- NHNESDep[,c("RIAGENDR","Diab","depressionTot")]

isContinuous <- sapply(apply(NHNESDep,2,unique),length) > 4
sum(isContinuous)
#> [1] 62

NHNESDep <- cbind(NHNESDep[,isContinuous],genderDiab)

table(NHNESDep$depressionTot)
#> 
#>    0    1 
#> 2072 2080

NHNESDepMales <- subset(NHNESDep,RIAGENDR==1)
table(NHNESDepMales$depressionTot)
#> 
#>    0    1 
#> 1191  867

NHNESDepFemales <- subset(NHNESDep,RIAGENDR==2)
table(NHNESDepFemales$depressionTot)
#> 
#>    0    1 
#>  881 1213

1.2.0.1 Standarize the names for the reporting

studyName <- "NHNES"
dataframe <- NHNESDepFemales
outcome <- "depressionTot"

TopVariables <- 10

thro <- 0.40
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
2094 64
pander::pander(table(dataframe[,outcome]))
0 1
881 1213

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) > 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9986573

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 56 , Uni p: 0.01060209 , Uncorrelated Base: 16 , Outcome-Driven Size: 0 , Base Size: 16 
#> 
#> 
 1 <R=0.999,r=0.974,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 1 : 0.974 ]( 1 , 1 , 0 ),<|>Tot Used: 2 , Added: 1 , Zero Std: 0 , Max Cor: 0.950
#> 
 2 <R=0.950,r=0.925,N=   15>, Top: 7( 1 )[ 1 : 7 Fa= 6 : 0.925 ]( 5 , 7 , 1 ),<|>Tot Used: 14 , Added: 7 , Zero Std: 0 , Max Cor: 0.923
#> 
 3 <R=0.923,r=0.911,N=   15>, Top: 3( 2 )[ 1 : 3 Fa= 7 : 0.911 ]( 3 , 4 , 6 ),<|>Tot Used: 18 , Added: 4 , Zero Std: 0 , Max Cor: 0.911
#> 
 4 <R=0.911,r=0.905,N=   15>, Top: 1( 1 )[ 1 : 1 Fa= 7 : 0.905 ]( 1 , 1 , 7 ),<|>Tot Used: 19 , Added: 1 , Zero Std: 0 , Max Cor: 0.899
#> 
 5 <R=0.899,r=0.850,N=    8>, Top: 4( 1 )[ 1 : 4 Fa= 10 : 0.850 ]( 4 , 4 , 7 ),<|>Tot Used: 25 , Added: 4 , Zero Std: 0 , Max Cor: 0.845
#> 
 6 <R=0.845,r=0.823,N=    8>, Top: 1( 1 )[ 1 : 1 Fa= 11 : 0.823 ]( 1 , 1 , 10 ),<|>Tot Used: 27 , Added: 1 , Zero Std: 0 , Max Cor: 0.819
#> 
 7 <R=0.819,r=0.759,N=    6>, Top: 3( 1 )[ 1 : 3 Fa= 13 : 0.759 ]( 3 , 3 , 11 ),<|>Tot Used: 30 , Added: 3 , Zero Std: 0 , Max Cor: 0.752
#> 
 8 <R=0.752,r=0.726,N=    6>, Top: 3( 1 )[ 1 : 3 Fa= 13 : 0.726 ]( 2 , 2 , 13 ),<|>Tot Used: 31 , Added: 2 , Zero Std: 0 , Max Cor: 0.724
#> 
 9 <R=0.724,r=0.612,N=   23>, Top: 8( 1 )[ 1 : 8 Fa= 16 : 0.612 ]( 8 , 13 , 13 ),<|>Tot Used: 40 , Added: 13 , Zero Std: 0 , Max Cor: 0.970
#> 
 10 <R=0.970,r=0.735,N=   23>, Top: 3( 1 )[ 1 : 3 Fa= 16 : 0.735 ]( 3 , 3 , 16 ),<|>Tot Used: 40 , Added: 3 , Zero Std: 0 , Max Cor: 0.934
#> 
 11 <R=0.934,r=0.717,N=   23>, Top: 1( 1 )[ 1 : 1 Fa= 17 : 0.717 ]( 1 , 1 , 16 ),<|>Tot Used: 40 , Added: 1 , Zero Std: 0 , Max Cor: 0.709
#> 
 12 <R=0.709,r=0.479,N=   22>, Top: 10( 1 )[ 1 : 10 Fa= 21 : 0.479 ]( 8 , 12 , 17 ),<|>Tot Used: 45 , Added: 12 , Zero Std: 0 , Max Cor: 0.877
#> 
 13 <R=0.877,r=0.563,N=   22>, Top: 3( 1 )[ 1 : 3 Fa= 21 : 0.563 ]( 3 , 5 , 21 ),<|>Tot Used: 45 , Added: 5 , Zero Std: 0 , Max Cor: 0.715
#> 
 14 <R=0.715,r=0.482,N=   22>, Top: 3( 1 )[ 1 : 3 Fa= 21 : 0.482 ]( 3 , 3 , 21 ),<|>Tot Used: 45 , Added: 3 , Zero Std: 0 , Max Cor: 0.926
#> 
 15 <R=0.926,r=0.588,N=   22>, Top: 1( 1 )[ 1 : 1 Fa= 21 : 0.588 ]( 1 , 1 , 21 ),<|>Tot Used: 45 , Added: 1 , Zero Std: 0 , Max Cor: 0.727
#> 
 16 <R=0.727,r=0.413,N=   18>, Top: 9( 1 )[ 1 : 9 Fa= 24 : 0.413 ]( 8 , 9 , 21 ),<|>Tot Used: 47 , Added: 9 , Zero Std: 0 , Max Cor: 0.554
#> 
 17 <R=0.554,r=0.400,N=   18>, Top: 4( 1 )[ 1 : 4 Fa= 24 : 0.400 ]( 4 , 4 , 24 ),<|>Tot Used: 47 , Added: 4 , Zero Std: 0 , Max Cor: 0.459
#> 
 18 <R=0.459,r=0.400,N=   18>, Top: 1( 1 )[ 1 : 1 Fa= 25 : 0.400 ]( 1 , 1 , 24 ),<|>Tot Used: 47 , Added: 1 , Zero Std: 0 , Max Cor: 0.395
#> 
 19 <R=0.395,r=0.400,N=    0>
#> 
 [ 19 ], 0.3946587 Decor Dimension: 47 Nused: 47 . Cor to Base: 24 , ABase: 10 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

1.71e+09

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

8.37e+08

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

0.395

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

0.948

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.6574137

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data


if (nrow(dataframe) < 1000)
{
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

1.8.2 The decorralted UMAP

if (nrow(dataframe) < 1000)
{

  datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")



univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
LBDHDD 56.3 15.86 59.33 16.37 9.15e-05 0.555
BMXWAIST 98.9 17.24 95.77 15.91 9.16e-04 0.555
BMXWT 77.9 21.27 74.42 19.70 1.92e-07 0.551
BMXBMI 30.0 7.74 28.70 6.97 9.56e-06 0.550
BPXPLS 74.7 11.80 72.77 11.16 3.12e-06 0.550
LBXWBCSI 7.6 2.75 7.18 2.10 3.96e-05 0.545
LBDNENO 4.5 1.94 4.19 1.62 3.25e-06 0.542
BMXARMC 32.6 5.62 31.89 5.20 3.12e-05 0.540
RIDAGEYR 47.2 17.10 49.41 17.13 2.92e-04 0.538
URXUCR.x 104.7 71.22 97.45 71.12 1.18e-09 0.536


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
LBDHDD 56.346 15.86 59.327 16.37 9.15e-05 0.555
La_MGDCGSZ 70.533 10.42 72.595 9.98 1.26e-01 0.553
BMXWT 77.938 21.27 74.417 19.70 1.92e-07 0.551
BPXPLS 74.653 11.80 72.767 11.16 3.12e-06 0.550
LBXWBCSI 7.599 2.75 7.175 2.10 3.96e-05 0.545
RIDAGEYR 47.216 17.10 49.407 17.13 2.92e-04 0.538
La_LBXRDW 37.247 1.30 37.072 1.12 2.35e-08 0.537
URXUCR.x 104.722 71.22 97.451 71.12 1.18e-09 0.536
La_LBXNEPCT 93.066 2.05 92.818 2.32 9.01e-06 0.535
Diab 953.000 260.00 752.000 129.00 NA 0.534
La_BMXWAIST 112.144 5.99 111.520 6.15 6.60e-01 0.529
La_MGXH2T2 -0.682 1.86 -0.528 1.79 1.65e-07 0.527

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
3.06 35 0.593


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
LBDHDD 56.346 15.86 59.327 16.37 9.15e-05 0.555 0.555 NA
BMXWAIST NA 98.945 17.24 95.773 15.91 9.16e-04 0.555 0.555 NA
La_MGDCGSZ + 0.314RIDAGEYR + 1.000MGDCGSZ 70.533 10.42 72.595 9.98 1.26e-01 0.553 0.528 5
BMXWT 77.938 21.27 74.417 19.70 1.92e-07 0.551 0.551 5
BPXPLS 74.653 11.80 72.767 11.16 3.12e-06 0.550 0.550 NA
LBXWBCSI 7.599 2.75 7.175 2.10 3.96e-05 0.545 0.545 6
RIDAGEYR 47.216 17.10 49.407 17.13 2.92e-04 0.538 0.538 2
La_LBXRDW + 0.695LBXMC + 1.000LBXRDW 37.247 1.30 37.072 1.12 2.35e-08 0.537 0.513 -1
URXUCR.x 104.722 71.22 97.451 71.12 1.18e-09 0.536 0.536 NA
MGXH2T2 NA 26.701 6.25 27.558 5.78 4.09e-01 0.535 0.535 NA
La_LBXNEPCT + 1.030LBXLYPCT + 1.000LBXNEPCT + 1.153*LBXEOPCT 93.066 2.05 92.818 2.32 9.01e-06 0.535 0.525 3
Diab 953.000 260.00 752.000 129.00 NA 0.534 0.534 NA
La_BMXWAIST -0.741BMXWT + 0.441BMXHT + 1.000*BMXWAIST 112.144 5.99 111.520 6.15 6.60e-01 0.529 0.555 -2
MGDCGSZ NA 55.718 11.88 57.093 11.17 5.87e-01 0.528 0.528 NA
La_MGXH2T2 + 1.000MGXH2T2 + 0.569MGXH1T3 -0.767*MGDCGSZ -0.682 1.86 -0.528 1.79 1.65e-07 0.527 0.535 -2
MGXH1T3 NA 26.972 6.28 27.589 5.85 6.88e-01 0.526 0.526 NA
LBXNEPCT NA 58.254 9.43 57.404 9.06 5.22e-02 0.525 0.525 NA
LBXLYPCT NA 30.957 8.73 31.509 8.18 1.68e-02 0.518 0.518 4
LBXMC NA 33.637 1.07 33.558 0.95 1.82e-04 0.516 0.516 NA
LBXRDW NA 13.864 1.49 13.744 1.32 0.00e+00 0.513 0.513 NA
BMXHT NA 160.982 6.86 160.846 6.95 7.15e-01 0.509 0.509 5
LBXEOPCT NA 2.527 1.79 2.555 1.98 3.00e-13 0.501 0.501 5

1.10 Comparing IDeA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,tol=0.002)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 205 676
1 160 1053
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.826 0.8088 0.842
    tp 0.579 0.5578 0.601
    se 0.868 0.8477 0.887
    sp 0.233 0.2052 0.262
    diag.ac 0.601 0.5794 0.622
    diag.or 1.996 1.5886 2.507
    nndx 9.922 6.7264 18.904
    youden 0.101 0.0529 0.149
    pv.pos 0.609 0.5856 0.632
    pv.neg 0.562 0.5090 0.613
    lr.pos 1.131 1.0843 1.180
    lr.neg 0.567 0.4699 0.684
    p.rout 0.174 0.1583 0.191
    p.rin 0.826 0.8088 0.842
    p.tpdn 0.767 0.7380 0.795
    p.tndp 0.132 0.1134 0.152
    p.dntp 0.391 0.3679 0.414
    p.dptn 0.438 0.3868 0.491
  • tab:

      Outcome + Outcome - Total
    Test + 1053 676 1729
    Test - 160 205 365
    Total 1213 881 2094
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.601 0.579 0.622
3 se 0.868 0.848 0.887
4 sp 0.233 0.205 0.262
6 diag.or 1.996 1.589 2.507

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 411 470
1 366 847
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.629 0.608 0.650
    tp 0.579 0.558 0.601
    se 0.698 0.672 0.724
    sp 0.467 0.433 0.500
    diag.ac 0.601 0.579 0.622
    diag.or 2.024 1.690 2.424
    nndx 6.069 4.462 9.548
    youden 0.165 0.105 0.224
    pv.pos 0.643 0.617 0.669
    pv.neg 0.529 0.493 0.565
    lr.pos 1.309 1.218 1.407
    lr.neg 0.647 0.579 0.723
    p.rout 0.371 0.350 0.392
    p.rin 0.629 0.608 0.650
    p.tpdn 0.533 0.500 0.567
    p.tndp 0.302 0.276 0.328
    p.dntp 0.357 0.331 0.383
    p.dptn 0.471 0.435 0.507
  • tab:

      Outcome + Outcome - Total
    Test + 847 470 1317
    Test - 366 411 777
    Total 1213 881 2094
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.601 0.579 0.622
3 se 0.698 0.672 0.724
4 sp 0.467 0.433 0.500
6 diag.or 2.024 1.690 2.424

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}
pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 0 881
1 0 1213
pander::pander(ptab)
  • er: Error

  • detail:

    NA
    NA
    NA
    NA
    NA
    NA
pander::pander(ptab$detail[c(5,3,4,6),])

NA, NA, NA and NA



par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }

  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 0 881
1 0 1213
  pander::pander(ptab)
  • er: Error

  • detail:

    NA
    NA
    NA
    NA
    NA
    NA
  pander::pander(ptab$detail[c(5,3,4,6),])

NA, NA, NA and NA

  par(op)

1.12 The Latent formulas

theLaFormulas <- getLatentCoefficients(DEdataframe)
                       

## Add the variable description

allvardes <- rbind(demographiDesc[,1:2],examinationDesc[,1:2],labsDesc[,1:2])
dup <- duplicated(allvardes[,1])
allvardes <- allvardes[!dup,]
rownames(allvardes) <- allvardes[,1]
lavar <-  names(theLaFormulas)[1]
theLaFormulas[[lavar]]
#>  WTINT2YR  WTMEC2YR 
#> -1.025106  1.000000
for (lavar in names(theLaFormulas))
{
  theLaFormulas[[lavar]]$Desc <- paste("(",allvardes[names(theLaFormulas[[lavar]]),2],")\n\r",sep="")
}

pander::pander(theLaFormulas)
  • La_WTMEC2YR:

    • WTINT2YR: -1.03
    • WTMEC2YR: 1
    • Desc: _(Full sample 2 year interview weight.)

_ and _(Full sample 2 year MEC exam weight.)

_

  • La_BPXML1:

    • RIDAGEYR: -0.523
    • BPXML1: 1
    • Desc: _(Age in years of the participant at the time of screening. Individuals 80 and over are topcoded at 80 years of age.)

_ and _(MIL: maximum inflation levels (mm Hg))

_

  • La_BMXBMI:

    • BMXWT: -0.375
    • BMXHT: 0.319
    • BMXBMI: 1
    • Desc: _(Weight (kg))

, (Standing Height (cm))

_ and _(Body Mass Index (kg/m**2))

_

  • La_BMXLEG:

    • BMXHT: -0.299
    • BMXLEG: 1
    • Desc: _(Standing Height (cm))

_ and _(Upper Leg Length (cm))

_

  • La_BMXARML:

    • BMXWT: -0.0398
    • BMXHT: -0.217
    • BMXARML: 1
    • Desc: _(Weight (kg))

, (Standing Height (cm))

_ and _(Upper Arm Length (cm))

_

  • La_BMXARMC:

    • BMXWT: -0.24
    • BMXHT: 0.132
    • BMXARMC: 1
    • Desc: _(Weight (kg))

, (Standing Height (cm))

_ and _(Arm Circumference (cm))

_

  • La_BMXWAIST:

    • BMXWT: -0.741
    • BMXHT: 0.441
    • BMXWAIST: 1
    • Desc: _(Weight (kg))

, (Standing Height (cm))

_ and _(Waist Circumference (cm))

_

  • La_MGXH1T1:

    • MGXH1T1: 1
    • MGDCGSZ: -0.453
    • Desc: _(Grip strength (kg), hand 1, test 1)

_ and _(Combined grip strength (kg): the sum of the largest reading from each hand.)

_

  • La_MGXH2T1:

    • MGXH2T1: 1
    • MGXH1T3: 0.606
    • MGDCGSZ: -0.77
    • Desc: _(Grip strength (kg), hand 2, test 1)

, (Grip strength (kg), hand 1, test 3)

_ and _(Combined grip strength (kg): the sum of the largest reading from each hand.)

_

  • La_MGXH1T2:

    • MGXH1T2: 1
    • MGXH1T3: -0.508
    • MGDCGSZ: -0.224
    • Desc: _(Grip strength (kg), hand 1, test 2)

, (Grip strength (kg), hand 1, test 3)

_ and _(Combined grip strength (kg): the sum of the largest reading from each hand.)

_

  • La_MGXH2T2:

    • MGXH2T2: 1
    • MGXH1T3: 0.569
    • MGDCGSZ: -0.767
    • Desc: _(Grip strength (kg), hand 2, test 2)

, (Grip strength (kg), hand 1, test 3)

_ and _(Combined grip strength (kg): the sum of the largest reading from each hand.)

_

  • La_MGXH1T3:

    • MGXH1T3: 1
    • MGDCGSZ: -0.492
    • Desc: _(Grip strength (kg), hand 1, test 3)

_ and _(Combined grip strength (kg): the sum of the largest reading from each hand.)

_

  • La_MGXH2T3:

    • MGXH1T2: 0.409
    • MGXH1T3: 0.364
    • MGXH2T3: 1
    • MGDCGSZ: -0.862
    • Desc: _(Grip strength (kg), hand 1, test 2)

, (Grip strength (kg), hand 1, test 3)

, (Grip strength (kg), hand 2, test 3)

_ and _(Combined grip strength (kg): the sum of the largest reading from each hand.)

_

  • La_MGDCGSZ:

    • RIDAGEYR: 0.314
    • MGDCGSZ: 1
    • Desc: _(Age in years of the participant at the time of screening. Individuals 80 and over are topcoded at 80 years of age.)

_ and _(Combined grip strength (kg): the sum of the largest reading from each hand.)

_

  • La_OHX04TC:

    • OHX04TC: 1
    • OHX11TC: -0.732
    • Desc: _(Tooth Count: Upper right 2nd bicuspid/2nd primary molar (2B))

_ and _(Tooth Count: Upper left cuspid (C))

_

  • La_OHX06TC:

    • OHX06TC: 1
    • OHX11TC: -0.811
    • Desc: _(Tooth Count: Upper right cuspid (C))

_ and _(Tooth Count: Upper left cuspid (C))

_

  • La_OHX10TC:

    • OHX10TC: 1
    • OHX11TC: -0.776
    • Desc: _(Tooth Count: Upper left lateral incisor (LI))

_ and _(Tooth Count: Upper left cuspid (C))

_

  • La_OHX13TC:

    • OHX04TC: -0.652
    • OHX13TC: 1
    • Desc: _(Tooth Count: Upper right 2nd bicuspid/2nd primary molar (2B))

_ and _(Tooth Count: Upper left 2nd bicuspid/2nd primary molar (2B))

_

  • La_OHX20TC:

    • OHX20TC: 1
    • OHX29TC: -0.631
    • Desc: _(Tooth Count: Lower left 2nd bicuspid/2nd primary molar (2B))

_ and _(Tooth Count: Lower right 2nd bicuspid/2nd primary molar (2B))

_

  • La_OHX29TC:

    • OHX11TC: -0.659
    • OHX29TC: 1
    • Desc: _(Tooth Count: Upper left cuspid (C))

_ and _(Tooth Count: Lower right 2nd bicuspid/2nd primary molar (2B))

_

  • La_bpresSYM:

    • BPXML1: -0.941
    • bpresSYM: 1
    • Desc: _(MIL: maximum inflation levels (mm Hg))

_ and _(NA)

_

  • La_URDACT:

    • URXUMA: -1.17
    • URDACT: 1
    • Desc: _(Albumin, urine (ug/mL))

_ and _(Albumin creatinine ratio (mg/g))

_

  • La_LBXMOPCT:

    • LBXLYPCT: 0.995
    • LBXMOPCT: 1
    • LBXNEPCT: 0.994
    • LBXEOPCT: 1
    • LBXBAPCT: 0.956
    • Desc: _(Lymphocyte percent (%))

, (Monocyte percent (%))

, (Segmented neutrophils percent (%))

, (Eosinophils percent (%))

_ and _(Basophils percent (%))

_

  • La_LBXNEPCT:

    • LBXLYPCT: 1.03
    • LBXNEPCT: 1
    • LBXEOPCT: 1.15
    • Desc: _(Lymphocyte percent (%))

, (Segmented neutrophils percent (%))

_ and _(Eosinophils percent (%))

_

  • La_LBDLYMNO:

    • LBXWBCSI: -0.326
    • LBXLYPCT: -0.0873
    • LBDLYMNO: 1
    • Desc: _(White blood cell count (1000 cells/uL))

, (Lymphocyte percent (%))

_ and _(Lymphocyte number (1000 cells/uL))

_

  • La_LBDMONO:

    • LBXWBCSI: -0.523
    • LBXLYPCT: 0.031
    • LBXNEPCT: 0.0285
    • LBXEOPCT: 0.0763
    • LBDLYMNO: 0.49
    • LBDMONO: 1
    • LBDNENO: 0.517
    • Desc: _(White blood cell count (1000 cells/uL))

, (Lymphocyte percent (%))

, (Segmented neutrophils percent (%))

, (Eosinophils percent (%))

, (Lymphocyte number (1000 cells/uL))

, (Monocyte number (1000 cells/uL))

_ and _(Segmented neutrophils num (1000 cell/uL))

_

  • La_LBDNENO:

    • LBXWBCSI: -0.898
    • LBXLYPCT: -0.0534
    • LBXNEPCT: -0.0548
    • LBXEOPCT: 0.0207
    • LBDLYMNO: 0.947
    • LBDNENO: 1
    • Desc: _(White blood cell count (1000 cells/uL))

, (Lymphocyte percent (%))

, (Segmented neutrophils percent (%))

, (Eosinophils percent (%))

, (Lymphocyte number (1000 cells/uL))

_ and _(Segmented neutrophils num (1000 cell/uL))

_

  • La_LBDEONO:

    • LBXWBCSI: -0.292
    • LBXLYPCT: -0.0162
    • LBXNEPCT: -0.0167
    • LBXEOPCT: -0.0658
    • LBDLYMNO: 0.288
    • LBDNENO: 0.304
    • LBDEONO: 1
    • Desc: _(White blood cell count (1000 cells/uL))

, (Lymphocyte percent (%))

, (Segmented neutrophils percent (%))

, (Eosinophils percent (%))

, (Lymphocyte number (1000 cells/uL))

, (Segmented neutrophils num (1000 cell/uL))

_ and _(Eosinophils number (1000 cells/uL))

_

  • La_LBXHGB:

    • LBXRBCSI: -1.57
    • LBXHGB: 1
    • Desc: _(Red blood cell count (million cells/uL))

_ and _(Hemoglobin (g/dL))

_

  • La_LBXHCT:

    • LBXHGB: -2.9
    • LBXHCT: 1
    • LBXMC: 1.06
    • Desc: _(Hemoglobin (g/dL))

, (Hematocrit (%))

_ and _(Mean cell hemoglobin concentration (g/dL))

_

  • La_LBXMCVSI:

    • LBXRBCSI: 2.52
    • LBXHGB: 1.03
    • LBXHCT: -0.651
    • LBXMCVSI: 1
    • LBXMCHSI: -2.65
    • LBXMC: 1.97
    • Desc: _(Red blood cell count (million cells/uL))

, (Hemoglobin (g/dL))

, (Hematocrit (%))

, (Mean cell volume (fL))

, (Mean cell hemoglobin (pg))

_ and _(Mean cell hemoglobin concentration (g/dL))

_

  • La_LBXMCHSI:

    • LBXRBCSI: 3.71
    • LBXHGB: -1.04
    • LBXMCHSI: 1
    • LBXMC: -1.78
    • Desc: _(Red blood cell count (million cells/uL))

, (Hemoglobin (g/dL))

, (Mean cell hemoglobin (pg))

_ and _(Mean cell hemoglobin concentration (g/dL))

_

  • La_LBXMC:

    • LBXRBCSI: 3.6
    • LBXHGB: -1.22
    • LBXMCHSI: 0.553
    • LBXMC: 0.0174
    • Desc: _(Red blood cell count (million cells/uL))

, (Hemoglobin (g/dL))

, (Mean cell hemoglobin (pg))

_ and _(Mean cell hemoglobin concentration (g/dL))

_

  • La_LBXRDW:

    • LBXMC: 0.695
    • LBXRDW: 1
    • Desc: _(Mean cell hemoglobin concentration (g/dL))

_ and _(Red cell distribution width (%))

_

  • La_LBXMPSI:

    • LBXPLTSI: 0.00681
    • LBXMPSI: 1
    • Desc: _(Platelet count (1000 cells/uL))

_ and _(Mean platelet volume (fL))

_